{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 将少数类均SMOTE到I（数据集样本数/类数）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n",
    "\n",
    "data = np.load('E:/IDS/cicdata/77/data_train.npy')\n",
    "label = np.load('E:/IDS/cicdata/77/label_train.npy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X=np.array(data)  #np.array将列表转换为数组，即去掉每个元素外面的中括号\n",
    "b=np.array(label)\n",
    "bb=b.reshape(b.shape[0],)     #reshape重新定义形状,此时标签数据是1维的\n",
    "y10 = np.int32(bb)  #将标签类型从浮点型换成整形"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(Counter(y10).items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#使用imlbearn库中上采样方法中的SMOTE接口\n",
    "from imblearn.over_sampling import SMOTE\n",
    "import time\n",
    "time_start = time.time()\n",
    "\n",
    "guo = 132101  #需要过采样的样本数\n",
    "\n",
    "smo = SMOTE(ratio={2:guo,3:guo,4:guo,5:guo,6:guo,\n",
    "                   7:guo,8:guo,9:guo,10:guo,11:guo,12:guo,13:guo,14:guo,},random_state=42)\n",
    "\n",
    "X_smo, y_smo = smo.fit_sample(X, y10)    #对数据和标签进行SMOTE处理\n",
    "print(sorted(Counter(y_smo).items()))\n",
    "\n",
    "time_end = time.time()\n",
    "time = time_end - time_start\n",
    "print(\"time:\",time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_smo.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 将多数类数据提取出来"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list0 = []  #正常类数据\n",
    "list1 = []  #1类数据\n",
    "list2 = []  #2-14数据\n",
    "list3 = [] #2-14类标签\n",
    "\n",
    "for i in range(X_smo.shape[0]):\n",
    "    if y_smo[i] == 0:\n",
    "        list0.append(X_smo[i])\n",
    "    elif y_smo[i] == 1:\n",
    "        list1.append(X_smo[i])\n",
    "    else:\n",
    "        list2.append(X_smo[i])\n",
    "        list3.append(y_smo[i])\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data0 = np.array(list0)  #np.array将列表转换为数组，即去掉每个元素外面的中括号\n",
    "data1 = np.array(list1)\n",
    "data2 = np.array(list2)\n",
    "label2_14 = np.array(list3)\n",
    "\n",
    "label214 = label2_14.reshape(label2_14.shape[0],)     #reshape重新定义形状,此时标签数据是1维的\n",
    "\n",
    "print(\"正常类数据形状：\",data0.shape)\n",
    "print(\"1类数据形状：\",data1.shape)  #使用函数查看类分布需要这种格式\n",
    "print(\"2-14类数据形状：\",data2.shape)\n",
    "print(\"2-14类标签形状：\",label214.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label214"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 将多数类数据聚成（总类数）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "import time\n",
    "time_start = time.time()\n",
    "\n",
    "estimator = KMeans(n_clusters=10)\n",
    "estimator.fit(data0)  # 聚类\n",
    "# estimator.fit(data1)\n",
    "\n",
    "time_end = time.time()\n",
    "time = time_end - time_start\n",
    "print(\"time:\",time)\n",
    "\n",
    "label_pred_0 = estimator.labels_  # 获取聚类标签\n",
    "# label_pred_1 = estimator.predict(data1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(Counter(label_pred_0).items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_pred = label_pred_0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 从每个簇中选出一定量的数据组成新的多数类数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c0 = []\n",
    "c1 = []\n",
    "c2 = []\n",
    "c3 = []\n",
    "c4 = []\n",
    "c5 = []\n",
    "c6 = []\n",
    "c7 = []\n",
    "c8 = []\n",
    "c9 = []\n",
    "c10 = []\n",
    "c11 = []\n",
    "c12 = []\n",
    "c13 = []\n",
    "c14 = []\n",
    "\n",
    "# c9 = data0[label_pred == 9]\n",
    "# c7 = data0[label_pred == 7]\n",
    "\n",
    "# s0=s1=s2=s3=s4=s5=s6=s7=s8=s9=0\n",
    "s0=s1=s2=s3=s4=s5=s6=s7=s8=s9=s10=s11=s12=s13=s14=0\n",
    "\n",
    "for i in range(data0.shape[0]):\n",
    "    if label_pred[i] == 0:\n",
    "        c0.append(data0[i])\n",
    "        s0=s0+1\n",
    "    elif label_pred[i] == 1:\n",
    "        c1.append(data0[i])\n",
    "        s1=s1+1\n",
    "    elif label_pred[i] == 2:\n",
    "        c2.append(data0[i])\n",
    "        s2=s2+1\n",
    "    elif label_pred[i] == 3:\n",
    "        c3.append(data0[i])\n",
    "        s3=s3+1\n",
    "    elif label_pred[i] == 4:\n",
    "        c4.append(data0[i])\n",
    "        s4=s4+1\n",
    "    elif label_pred[i] == 5:\n",
    "        c5.append(data0[i])\n",
    "        s5=s5+1\n",
    "    elif label_pred[i] == 6:\n",
    "        c6.append(data0[i])\n",
    "        s6=s6+1\n",
    "    elif label_pred[i] == 7:\n",
    "        c7.append(data0[i])\n",
    "        s7=s7+1\n",
    "    elif label_pred[i] == 8:\n",
    "        c8.append(data0[i])\n",
    "        s8=s8+1\n",
    "    elif label_pred[i] == 9:\n",
    "        c9.append(data0[i])\n",
    "        s9=s9+1\n",
    "    elif label_pred[i] == 10:\n",
    "        c10.append(data0[i])\n",
    "        s10=s10+1\n",
    "    elif label_pred[i] == 11:\n",
    "        c11.append(data0[i])\n",
    "        s11=s11+1\n",
    "    elif label_pred[i] == 12:\n",
    "        c12.append(data0[i])\n",
    "        s12=s12+1\n",
    "    elif label_pred[i] == 13:\n",
    "        c13.append(data0[i])\n",
    "        s13=s13+1\n",
    "    elif label_pred[i] == 14:\n",
    "        c14.append(data0[i])\n",
    "        s14=s14+1\n",
    "        \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a=9405"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"删除前大小:\",len(c0))\n",
    "del c0[a:len(c0)]\n",
    "print(\"删除后大小:\",len(c0))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"删除前大小:\",len(c1))\n",
    "del c1[a:len(c1)]\n",
    "print(\"删除后大小:\",len(c1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"删除前大小:\",len(c2))\n",
    "del c2[a:len(c2)]\n",
    "print(\"删除后大小:\",len(c2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"删除前大小:\",len(c3))\n",
    "del c3[a:len(c3)]\n",
    "print(\"删除后大小:\",len(c3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"删除前大小:\",len(c4))\n",
    "del c4[a:len(c4)]\n",
    "print(\"删除后大小:\",len(c4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del c5[a:len(c5)]\n",
    "del c6[a:len(c6)]\n",
    "del c7[a:len(c7)]\n",
    "# del c8[a:len(c8)]\n",
    "del c9[a:len(c9)]\n",
    "del c10[a:len(c10)]\n",
    "del c11[a:len(c11)]\n",
    "del c12[a:len(c12)]\n",
    "del c13[a:len(c13)]\n",
    "del c14[a:len(c14)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c00 = np.array(c0)\n",
    "c11 = np.array(c1)\n",
    "c22 = np.array(c2)\n",
    "c33 = np.array(c3)\n",
    "c44 = np.array(c4)\n",
    "c55 = np.array(c5)\n",
    "c66 = np.array(c6)\n",
    "c77 = np.array(c7)\n",
    "c88 = np.array(c8)\n",
    "c99 = np.array(c9)\n",
    "c1010 = np.array(c10)\n",
    "c1111 = np.array(c11)\n",
    "c1212 = np.array(c12)\n",
    "c1313 = np.array(c13)\n",
    "c1414 = np.array(c14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "q0 = np.concatenate((c00,c11,c22,c33,c44,c55,c66,c77,c88,c99,c1010,c1111,c1212,c1313,c1414),axis=0)\n",
    "q0.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#创建一个全零数组为正常类的标签\n",
    "label_zc = np.zeros((q0.shape[0],), dtype=int)\n",
    "label_zc.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "import time\n",
    "time_start = time.time()\n",
    "\n",
    "estimator = KMeans(n_clusters=10)\n",
    "# estimator.fit(data0)  # 聚类\n",
    "estimator.fit(data1)\n",
    "\n",
    "time_end = time.time()\n",
    "time = time_end - time_start\n",
    "print(\"time:\",time)\n",
    "\n",
    "# label_pred_0 = estimator.labels_  # 获取聚类标签\n",
    "label_pred_1 = estimator.predict(data1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(Counter(label_pred_1).items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_pred = label_pred_1\n",
    "\n",
    "c0 = []\n",
    "c1 = []\n",
    "c2 = []\n",
    "c3 = []\n",
    "c4 = []\n",
    "c5 = []\n",
    "c6 = []\n",
    "c7 = []\n",
    "c8 = []\n",
    "c9 = []\n",
    "c10 = []\n",
    "c11 = []\n",
    "c12 = []\n",
    "c13 = []\n",
    "c14 = []\n",
    "\n",
    "# s0=s1=s2=s3=s4=s5=s6=s7=s8=s9=0\n",
    "s0=s1=s2=s3=s4=s5=s6=s7=s8=s9=s10=s11=s12=s13=s14=0\n",
    "\n",
    "for i in range(data1.shape[0]):\n",
    "    if label_pred[i] == 0:\n",
    "        c0.append(data1[i])\n",
    "        s0=s0+1\n",
    "    elif label_pred[i] == 1:\n",
    "        c1.append(data1[i])\n",
    "        s1=s1+1\n",
    "    elif label_pred[i] == 2:\n",
    "        c2.append(data1[i])\n",
    "        s2=s2+1\n",
    "    elif label_pred[i] == 3:\n",
    "        c3.append(data1[i])\n",
    "        s3=s3+1\n",
    "    elif label_pred[i] == 4:\n",
    "        c4.append(data1[i])\n",
    "        s4=s4+1\n",
    "    elif label_pred[i] == 5:\n",
    "        c5.append(data1[i])\n",
    "        s5=s5+1\n",
    "    elif label_pred[i] == 6:\n",
    "        c6.append(data1[i])\n",
    "        s6=s6+1\n",
    "    elif label_pred[i] == 7:\n",
    "        c7.append(data1[i])\n",
    "        s7=s7+1\n",
    "    elif label_pred[i] == 8:\n",
    "        c8.append(data1[i])\n",
    "        s8=s8+1\n",
    "    elif label_pred[i] == 9:\n",
    "        c9.append(data1[i])\n",
    "        s9=s9+1\n",
    "    elif label_pred[i] == 10:\n",
    "        c10.append(data1[i])\n",
    "        s10=s10+1\n",
    "    elif label_pred[i] == 11:\n",
    "        c11.append(data1[i])\n",
    "        s11=s11+1\n",
    "    elif label_pred[i] == 12:\n",
    "        c12.append(data1[i])\n",
    "        s12=s12+1\n",
    "    elif label_pred[i] == 13:\n",
    "        c13.append(data1[i])\n",
    "        s13=s13+1\n",
    "    elif label_pred[i] == 14:\n",
    "        c14.append(data1[i])\n",
    "        s14=s14+1\n",
    "        \n",
    "a=15191"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"删除前大小:\",len(c1))\n",
    "del c1[a:len(c1)]\n",
    "print(\"删除后大小:\",len(c1))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del c2[a:len(c2)]\n",
    "print(\"删除前大小:\",len(c4))\n",
    "del c4[a:len(c4)]\n",
    "print(\"删除后大小:\",len(c4))\n",
    "print(\"删除前大小:\",len(c6))\n",
    "del c6[a:len(c6)]\n",
    "print(\"删除后大小:\",len(c6))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c0 = np.array(c0)\n",
    "c1 = np.array(c1)\n",
    "c2 = np.array(c2)\n",
    "c3 = np.array(c3)\n",
    "c4 = np.array(c4)\n",
    "c5 = np.array(c5)\n",
    "c6 = np.array(c6)\n",
    "c7 = np.array(c7)\n",
    "c8 = np.array(c8)\n",
    "c9 = np.array(c9)\n",
    "c10 = np.array(c10)\n",
    "c11 = np.array(c11)\n",
    "c12 = np.array(c12)\n",
    "c13 = np.array(c13)\n",
    "c14 = np.array(c14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "q1 = np.concatenate((c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14),axis=0)\n",
    "q1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#创建一个数组标签\n",
    "label_1 = np.ones((q1.shape[0],), dtype=int)\n",
    "\n",
    "label_1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#将正常类数据和攻击数据合并到一起，将标签也合并到一起\n",
    "data_end = np.concatenate((q0,q1,data2),axis=0)\n",
    "label_end = np.concatenate((label_zc,label_1,label214),axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_end.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(Counter(label_end).items())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_end = label_end.reshape(label_end.shape[0],1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#将最终采样后的数据保存成文件\n",
    "np.save(\"E:/IDS/cicdata/K-means+SMOTE/data.npy\",data_end)\n",
    "np.save(\"E:/IDS/cicdata/K-means+SMOTE/label.npy\",label_end)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
